Here we present an example analysis of 10 million PBMCs treated with cytokines and processed through Parse Biosciences Evercode workflow. We use the python package Scanpy. We will demonstrate steps starting from loading in the data, to preprocessing, and finally to clustering.¶

In [1]:
import os
import sys
import scipy
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import scipy.io as sio
import scanpy.external as sce
import matplotlib.pyplot as plt
import time

sc.settings.verbosity = 1
obj_save_path = '/10M_dataset/analysis/'

# Adjust Scanpy figure defaults
sc.settings.set_figure_params(dpi=100, fontsize=10, dpi_save=400,
    facecolor = 'white', figsize=(6,6), format='png')

Function to get the standard deviation of each column of a sparse CSR matrix. This will help with replacing the standard Scanpy scale and PCA functions in order to use less memory.¶

In [2]:
def sparse_std(X, blocksize=100):
    """
    Return the standard deviation of each column of a sparse matrix.

    Works block-by-block so that only `blocksize` columns are densified
    at a time, keeping peak memory low compared to densifying the whole
    matrix at once.

    inputs
        X: sparse matrix

        blocksize: number of columns to densify per block. A larger
        blocksize increases memory usage, since each block is converted
        to a dense matrix before numpy's .std() is applied.

    outputs
        X_std: numpy vector with the std deviation of each column of X
    """
    n, m = X.shape
    # BUG FIX: the original re-assigned `blocksize = 100` here, silently
    # ignoring the caller's argument; that override has been removed.
    k = m // blocksize
    X_std = []
    for i in range(k):
        X_std += list(X[:, blocksize * i : blocksize * (i + 1)].todense().std(0).A1)
    # Remaining columns that did not fill a whole block
    X_std += list(X[:, k * blocksize :].todense().std(0).A1)
    return np.array(X_std)

IRLB functions to replace normal scanpy scale and pca steps in order to reduce memory usage¶

In [3]:
#   irlbpy code
#       From:   https://github.com/airysen/irlbpy
#       Date:   2021-12
#       License: Apache License, V 2.0, January 2004
#
#       Code unmodified except:
#           Added two print (feedback) blocks
#           Ran through lint formatting tool 'black' https://github.com/psf/black
#
import numpy as np
import scipy.sparse as sparse
import warnings

from numpy.fft import rfft, irfft
import numpy.linalg as nla


# Matrix-vector product wrapper
# A is a numpy 2d array or matrix, or a scipy matrix or sparse matrix.
# x is a numpy vector only.
# Compute A.dot(x) if t is False,  A.transpose().dot(x)  otherwise.


def multA(A, x, TP=False, L=None):
    """Matrix-vector product wrapper for dense or scipy sparse matrices.

    Returns A.dot(x) when TP is False, and the transposed product
    A.T.dot(x) when TP is True. The unused L keyword keeps the signature
    interchangeable with multS. x must be a 1-D numpy vector.
    """
    if not sparse.issparse(A):
        return x.dot(A) if TP else A.dot(x)
    if TP:
        # Compute x @ A via a 1-row sparse matrix, then flatten back to 1-D.
        row = sparse.csr_matrix(x)
        return row.dot(A).transpose().todense().A[:, 0]
    # Dense 1-D result of A @ x, computed through a sparse column vector.
    col = sparse.csr_matrix(x).transpose()
    return A.dot(col).todense().A[:, 0]


def multS(s, v, L, TP=False):
    """Apply the structured operator encoded in vector s to vector v.

    The product is computed as a circular convolution via real FFTs;
    prepare_v reverses and zero-pads v (and validates its length), and
    TP selects the transposed product. Returns a slice of length L
    (or N-L+1 for the transpose).
    """
    N = s.shape[0]
    padded = prepare_v(v, N, L, TP=TP)
    conv = irfft(rfft(padded) * rfft(s))
    return conv[L - 1 :] if TP else conv[:L]


def prepare_s(s, L=None):
    """Cyclically rotate the operator vector s by K-1 = N-L positions.

    This aligns s with the reversed/padded vectors produced by prepare_v
    so the FFT product in multS yields the intended slice. L defaults to
    N // 2 when not supplied.
    """
    N = s.shape[0]
    if L is None:
        L = N // 2
    shift = (N - L + 1) - 1
    return np.roll(s, shift)


def prepare_v(v, N, L, TP=False):
    """Reverse and zero-pad v to length N for the FFT product in multS.

    With TP=True, v must have length L and is left-padded with K-1
    zeros; with TP=False, v must have length K = N-L+1 and is
    right-padded with L-1 zeros.

    Raises VectorLengthException when v has the wrong length.
    """
    reversed_v = v.flatten()[::-1]
    K = N - L + 1
    if TP:
        if reversed_v.shape[0] != L:
            raise VectorLengthException(
                "Length of v must be  L (if transpose flag is True)"
            )
        return np.pad(reversed_v, (K - 1, 0), mode="constant", constant_values=0)
    if reversed_v.shape[0] != K:
        raise VectorLengthException("Length of v must be N-K+1")
    return np.pad(reversed_v, (0, L - 1), mode="constant", constant_values=0)


def orthog(Y, X):
    """Orthogonalize a vector or matrix Y against the columns of matrix X.

    Computes Y - X @ (X.T @ Y) through the multA wrapper so both dense
    and sparse X are supported. Requires that Y has no more columns than
    X and that Y and X have the same number of rows.
    """
    projections = multA(X, Y, TP=True)
    return Y - multA(X, projections)


# Simple utility function used to check linear dependencies during computation:


def invcheck(x):
    """Return 1/x, guarding against values too small to invert reliably.

    Values at or below twice the machine epsilon are mapped to 0 and a
    warning is emitted, since dividing by them would amplify numerical
    noise (used to detect linear dependence during the Lanczos process).
    """
    threshold = 2 * np.finfo(float).eps
    if x <= threshold:
        warnings.warn("Ill-conditioning encountered, result accuracy may be poor")
        return 0
    return 1 / x


def lanczos(A, nval, tol=0.0001, maxit=50, center=None, scale=None, L=None):
    """Estimate a few of the largest singular values and corresponding singular
    vectors of matrix using the implicitly restarted Lanczos bidiagonalization
    method of Baglama and Reichel, see:

    Augmented Implicitly Restarted Lanczos Bidiagonalization Methods,
    J. Baglama and L. Reichel, SIAM J. Sci. Comput. 2005

    Keyword arguments:
    tol    -- An estimation tolerance. Smaller means more accurate estimates.
    maxit  -- Maximum number of Lanczos iterations allowed.
    center -- Optional 1-D vector subtracted from each left product via
              np.dot(center, VJ); used by the caller to mean-center columns
              implicitly, without densifying a sparse A.
    scale  -- Optional 1-D vector dividing the right-hand vectors; used by
              the caller to scale columns implicitly.
    L      -- Only used when A is a 1-D array, in which case lanczos works
              on the structured operator built by prepare_s and multS.

    Given an input matrix A of dimension j * k and a desired number of
    singular values nval, the function returns a LanczosResult with fields:

    U      A j * nu matrix of estimated left singular vectors.
    s      A vector of length nu of estimated singular values.
    V      A k * nu matrix of estimated right singular vectors.
    steps  The number of Lanczos iterations run.
    nmult  The number of matrix-vector products run.

    The algorithm estimates the truncated singular value decomposition:
    A.dot(V) = U*s.

    NOTE(review): the starting vector is drawn from np.random.randn, so
    results depend on the global NumPy random state (seed externally for
    reproducibility).
    """

    import sys  # local import kept from the vendored code; sys is also imported at module level

    # Feedback block added on top of the upstream irlbpy code (see header note).
    print(
        f">> lanczos A={A.shape}, nval={nval}, tol={tol}, maxit={maxit}",
        file=sys.stdout,
    )
    center_story = "None" if center is None else f"{center.shape}"
    scale_story = "None" if scale is None else f"{scale.shape}"
    print(f"++ lanczos center={center_story}, scale={scale_story}", file=sys.stdout)
    sys.stdout.flush()

    mmult = None
    m = None
    n = None
    if A.ndim == 2:
        # Ordinary (dense or sparse) 2-D matrix: use the matrix-vector wrapper.
        mmult = multA
        m = A.shape[0]
        n = A.shape[1]
        if min(m, n) < 2:
            raise MatrixShapeException("The input matrix must be at least 2x2.")

    elif A.ndim == 1:
        # 1-D input: treat A as a structured operator multiplied via FFTs.
        mmult = multS
        A = np.pad(A, (0, A.shape[0] % 2), mode="edge")  # pad to even length
        N = A.shape[0]
        if L is None:
            L = N // 2
        K = N - L + 1
        m = L
        n = K
        A = prepare_s(A, L)
    elif A.ndim > 2:
        raise MatrixShapeException("The input matrix must be 2D array")
    nu = nval

    m_b = min((nu + 20, 3 * nu, n))  # Working dimension size
    mprod = 0  # matrix-vector product counter
    it = 0  # restart-iteration counter
    j = 0
    k = nu  # number of Ritz vectors kept across restarts
    smax = 1  # largest Ritz value seen so far (for relative tolerance)
    # sparse = sparse.issparse(A)

    V = np.zeros((n, m_b))  # right Lanczos vectors
    W = np.zeros((m, m_b))  # left Lanczos vectors
    F = np.zeros((n, 1))  # residual vector
    B = np.zeros((m_b, m_b))  # projected bidiagonal matrix

    V[:, 0] = np.random.randn(n)  # Initial vector
    # Note: norm of the full V equals the norm of column 0 here, since all
    # other columns are still zero.
    V[:, 0] = V[:, 0] / np.linalg.norm(V)

    # Outer loop: implicit restarts until convergence or maxit.
    while it < maxit:
        if it > 0:
            j = k  # after a restart, resume from the retained subspace size

        VJ = V[:, j]

        # apply scaling
        if scale is not None:
            VJ = VJ / scale

        W[:, j] = mmult(A, VJ, L=L)
        mprod = mprod + 1

        # apply centering
        # R code: W[, j_w] <- W[, j_w] - ds * drop(cross(dv, VJ)) * du
        if center is not None:
            W[:, j] = W[:, j] - np.dot(center, VJ)

        if it > 0:
            # NB W[:,0:j] selects columns 0,1,...,j-1
            W[:, j] = orthog(W[:, j], W[:, 0:j])
        s = np.linalg.norm(W[:, j])
        sinv = invcheck(s)  # 0 when s is numerically zero (linear dependence)
        W[:, j] = sinv * W[:, j]

        # Lanczos process
        while j < m_b:
            F = mmult(A, W[:, j], TP=True, L=L)
            mprod = mprod + 1

            # apply scaling
            if scale is not None:
                F = F / scale

            F = F - s * V[:, j]
            # Full reorthogonalization of the residual against all V columns.
            F = orthog(F, V[:, 0 : j + 1])
            fn = np.linalg.norm(F)
            fninv = invcheck(fn)
            F = fninv * F
            if j < m_b - 1:
                V[:, j + 1] = F
                B[j, j] = s
                B[j, j + 1] = fn
                VJp1 = V[:, j + 1]

                # apply scaling
                if scale is not None:
                    VJp1 = VJp1 / scale

                W[:, j + 1] = mmult(A, VJp1, L=L)
                mprod = mprod + 1

                # apply centering
                # R code: W[, jp1_w] <- W[, jp1_w] - ds * drop(cross(dv, VJP1))
                # * du
                if center is not None:
                    W[:, j + 1] = W[:, j + 1] - np.dot(center, VJp1)

                # One step of classical Gram-Schmidt...
                W[:, j + 1] = W[:, j + 1] - fn * W[:, j]
                # ...with full reorthogonalization
                W[:, j + 1] = orthog(W[:, j + 1], W[:, 0 : (j + 1)])
                s = np.linalg.norm(W[:, j + 1])
                sinv = invcheck(s)
                W[:, j + 1] = sinv * W[:, j + 1]
            else:
                B[j, j] = s
            j = j + 1
        # End of Lanczos process
        S = nla.svd(B)  # SVD of the small projected matrix (Ritz values/vectors)
        R = fn * S[0][m_b - 1, :]  # Residuals
        if it == 0:
            smax = S[1][0]  # Largest Ritz value
        else:
            smax = max((S[1][0], smax))

        # Count Ritz values whose residual is below the relative tolerance.
        conv = sum(np.abs(R[0:nu]) < tol * smax)
        if conv < nu:  # Not converged yet
            # Grow the retained subspace, but keep room for restart vectors.
            k = max(conv + nu, k)
            k = min(k, m_b - 3)
        else:
            break
        # Update the Ritz vectors
        V[:, 0:k] = V[:, 0:m_b].dot(S[2].transpose()[:, 0:k])
        V[:, k] = F
        B = np.zeros((m_b, m_b))
        # Improve this! There must be better way to assign diagonal...
        for l in range(k):
            B[l, l] = S[1][l]
        B[0:k, k] = R[0:k]
        # Update the left approximate singular vectors
        W[:, 0:k] = W[:, 0:m_b].dot(S[0][:, 0:k])
        it = it + 1

    # Recover the approximate singular vectors of A from the final subspace.
    U = W[:, 0:m_b].dot(S[0][:, 0:nu])
    V = V[:, 0:m_b].dot(S[2].transpose()[:, 0:nu])
    # return((U, S[1][0:nu], V, it, mprod))

    print(f"<< lanczos it={it} mprod={mprod}", file=sys.stdout)
    sys.stdout.flush()

    return LanczosResult(
        **{"U": U, "s": S[1][0:nu], "V": V, "steps": it, "nmult": mprod}
    )


class LanczosResult:
    """Attribute container for lanczos() output (U, s, V, steps, nmult)."""

    def __init__(self, **kwargs):
        for name, value in kwargs.items():
            setattr(self, name, value)


class VectorLengthException(Exception):
    """Raised by prepare_v when the input vector has the wrong length."""


class MatrixShapeException(Exception):
    """Raised by lanczos when the input matrix has an unsupported shape."""

Read in anndata object containing about 10 million PBMCs that have been treated with various cytokines and processed through the Parse Biosciences Evercode workflow in a single experiment.¶

In [4]:
%%time
adata = sc.read_h5ad(obj_save_path + "Parse_10M_PBMC_cytokines.h5ad")
CPU times: user 32.6 s, sys: 3min 11s, total: 3min 43s
Wall time: 13min 40s

To simplify a first pass of the tutorial, we want to demonstrate the workflow on a smaller subset of the full dataset. We will demonstrate the workflow on a downsampled object of 1 million cells.¶

In [5]:
%%time
# Set a random seed for reproducibility
random_seed = 42 

# Number of cells you want to downsample to
num_cells = 1000000 

# Check if the requested number of cells is less than the total cells in adata
if num_cells < adata.shape[0]:
    # Randomly select the indices for downsampling
    np.random.seed(random_seed)
    random_indices = np.random.choice(adata.shape[0], num_cells, replace=False)
    # Create the downsampled AnnData object
    adata = adata[random_indices, :].copy()
else:
    print("The requested number of cells is greater than or equal to the total number of cells in the dataset.")

# Now adata contains the downsampled data
CPU times: user 6.57 s, sys: 4.74 s, total: 11.3 s
Wall time: 11.3 s
In [6]:
adata
Out[6]:
AnnData object with n_obs × n_vars = 1000000 × 40352
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
    var: 'n_cells'
In [7]:
%%time
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
CPU times: user 20.7 s, sys: 1.13 s, total: 21.8 s
Wall time: 21.8 s
In [8]:
adata
Out[8]:
AnnData object with n_obs × n_vars = 1000000 × 40352
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
    var: 'n_cells'
    uns: 'log1p'
In [9]:
%%time
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.25)
sc.pl.highly_variable_genes(adata) 

# Save raw expression values before variable gene subset
adata.raw = adata
No description has been provided for this image
CPU times: user 42.8 s, sys: 15.8 s, total: 58.6 s
Wall time: 39.4 s
In [10]:
%%time
adata = adata[:, adata.var.highly_variable].copy()
CPU times: user 9.76 s, sys: 2.87 s, total: 12.6 s
Wall time: 12.6 s
In [11]:
adata
Out[11]:
AnnData object with n_obs × n_vars = 1000000 × 5632
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

Replace scanpy.pp.scale and scanpy.pp.pca with the following code to use less memory¶

In [12]:
%%time
use_hv = True
adata.uns['pca'] = {}
adata.uns['pca']['params'] = {
    'zero_center': True,
    'use_highly_variable': use_hv,
}

adata.var['mean'] = adata.X.mean(0).A1
adata.var['std'] = sparse_std(adata.X)

S = lanczos(adata.X,50,center=adata.var['mean'],scale=adata.var['std'])

adata.obsm['X_pca'] = (S.U * S.s)
adata.varm['PCs'] = S.V
adata.uns['pca']['variance'] = adata.obsm['X_pca'].var(0)
adata.uns['pca']['variance_ratio'] = adata.uns['pca']['variance'] / (adata.X.shape[1] - 1 )
>> lanczos A=(1000000, 5632), nval=50, tol=0.0001, maxit=50
++ lanczos center=(5632,), scale=(5632,)
<< lanczos it=18 mprod=248
CPU times: user 35min 30s, sys: 1min 39s, total: 37min 10s
Wall time: 9min 25s
In [13]:
obj_save_path = "/10M_dataset/analysis/re_cluster_after_doublet_cluster_removal/"
adata.write(obj_save_path + 'adata_post_pca.h5ad')
In [14]:
adata
Out[14]:
AnnData object with n_obs × n_vars = 1000000 × 5632
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

Continue with neighbors, umap, and leiden¶

In [15]:
%%time
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30)
/home/ubuntu/miniconda3/envs/spipe/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
CPU times: user 4min, sys: 10.1 s, total: 4min 10s
Wall time: 4min 3s
In [16]:
adata
Out[16]:
AnnData object with n_obs × n_vars = 1000000 × 5632
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors'
    obsm: 'X_pca'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [17]:
%%time
sc.tl.umap(adata)
CPU times: user 5h 1min 41s, sys: 20.2 s, total: 5h 2min 1s
Wall time: 11min 36s
In [18]:
adata
Out[18]:
AnnData object with n_obs × n_vars = 1000000 × 5632
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [ ]:
adata.write(obj_save_path + 'adata_post_umap.h5ad')
In [20]:
%%time
sc.tl.leiden(adata, flavor="igraph", n_iterations=2)
CPU times: user 1min 9s, sys: 4.65 s, total: 1min 13s
Wall time: 1min 13s
In [21]:
adata
Out[21]:
AnnData object with n_obs × n_vars = 1000000 × 5632
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [ ]:
%%time
adata.write(obj_save_path + 'adata_post_leiden.h5ad')

Plotting¶

In [24]:
%%time
sc.pl.umap(adata, color='leiden', legend_fontsize=8)
No description has been provided for this image
CPU times: user 3.01 s, sys: 49 ms, total: 3.06 s
Wall time: 3.05 s
In [25]:
%%time
sc.pl.umap(adata, color='leiden', legend_fontsize=8, legend_loc = "on data")
No description has been provided for this image
CPU times: user 2.78 s, sys: 47 ms, total: 2.82 s
Wall time: 2.82 s
In [26]:
%%time
sc.pl.umap(adata, color='donor', legend_fontsize=8)
No description has been provided for this image
CPU times: user 2.83 s, sys: 46 ms, total: 2.88 s
Wall time: 2.88 s

If it is desirable to remove the batch effects of having different PBMC donors, one can run harmony integration. Due to technical variation and differences in underlying biology between PBMC donors, integrating with harmony can help users more easily identify which clusters correspond to which cell type.¶

In [28]:
%%time
sce.pp.harmony_integrate(adata, 'donor')
2024-10-28 21:55:02,127 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2024-10-28 21:55:41,668 - harmonypy - INFO - sklearn.KMeans initialization complete.
2024-10-28 21:55:44,326 - harmonypy - INFO - Iteration 1 of 10
2024-10-28 21:59:58,634 - harmonypy - INFO - Iteration 2 of 10
2024-10-28 22:04:15,760 - harmonypy - INFO - Iteration 3 of 10
2024-10-28 22:08:27,919 - harmonypy - INFO - Iteration 4 of 10
2024-10-28 22:12:41,974 - harmonypy - INFO - Iteration 5 of 10
2024-10-28 22:16:57,073 - harmonypy - INFO - Iteration 6 of 10
2024-10-28 22:21:17,081 - harmonypy - INFO - Iteration 7 of 10
2024-10-28 22:22:40,212 - harmonypy - INFO - Converged after 7 iterations
CPU times: user 10h 30min 12s, sys: 7min 54s, total: 10h 38min 7s
Wall time: 27min 38s
In [29]:
adata
Out[29]:
AnnData object with n_obs × n_vars = 1000000 × 5632
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'donor_colors', 'cell_type_colors'
    obsm: 'X_pca', 'X_umap', 'X_pca_harmony'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [30]:
%%time
#adata.write(obj_save_path + 'adata_post_harmony.h5ad')
CPU times: user 6 μs, sys: 0 ns, total: 6 μs
Wall time: 11.9 μs
In [ ]:
 

Continue with neighbors. To use the results of harmony integration, we set use_rep = "X_pca_harmony"¶

In [31]:
%%time
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30, use_rep="X_pca_harmony")
CPU times: user 3min 27s, sys: 5.67 s, total: 3min 33s
Wall time: 3min 27s
In [32]:
adata
Out[32]:
AnnData object with n_obs × n_vars = 1000000 × 5632
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'donor_colors', 'cell_type_colors'
    obsm: 'X_pca', 'X_umap', 'X_pca_harmony'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

Continue with umap and leiden clustering.¶

In [33]:
%%time
sc.tl.umap(adata)
CPU times: user 4h 22min 6s, sys: 7.41 s, total: 4h 22min 14s
Wall time: 10min 44s
In [34]:
adata
Out[34]:
AnnData object with n_obs × n_vars = 1000000 × 5632
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'donor', 'cytokine', 'treatment', 'cell_type', 'leiden'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'donor_colors', 'cell_type_colors'
    obsm: 'X_pca', 'X_umap', 'X_pca_harmony'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [35]:
%%time
#adata.write(obj_save_path + 'adata_post_harmony_post_umap.h5ad')
CPU times: user 6 μs, sys: 0 ns, total: 6 μs
Wall time: 12.2 μs
In [36]:
%%time
sc.tl.leiden(adata, resolution=1.0, flavor="igraph", n_iterations=2)
CPU times: user 1min 6s, sys: 3.79 s, total: 1min 9s
Wall time: 1min 9s
In [ ]:
 
In [37]:
%%time
sc.pl.umap(adata, color='leiden', legend_fontsize=8, legend_loc = "on data")
No description has been provided for this image
CPU times: user 2.71 s, sys: 41 ms, total: 2.76 s
Wall time: 2.75 s
In [38]:
%%time
sc.pl.umap(adata, color='leiden', legend_fontsize=8)
No description has been provided for this image
CPU times: user 2.98 s, sys: 19 ms, total: 3 s
Wall time: 3 s

We now plot by donor to show the results of harmony integration.¶

In [39]:
%%time
sc.pl.umap(adata, color='donor', legend_fontsize=8)
No description has been provided for this image
CPU times: user 2.84 s, sys: 25 ms, total: 2.87 s
Wall time: 2.87 s

The first and most notable effect of running the integration function is seeing how the samples overlap with one another, which simplifies the process of annotating cell types as it reduces the number of clusters. In this case, we don't see a drastic reduction in the number of clusters, but in scenarios with greater technical and biological variation, this can have a much greater impact.¶

We now overlay cell type annotations on the umap plot, which were determined using standard marker genes for PBMCs.¶

In [40]:
%%time
sc.pl.umap(adata, color='cell_type', legend_fontsize=8, legend_loc = "on data")
No description has been provided for this image
CPU times: user 2.74 s, sys: 20 ms, total: 2.76 s
Wall time: 2.76 s
In [41]:
%%time
sc.pl.umap(adata, color='cell_type', legend_fontsize=8)
No description has been provided for this image
CPU times: user 2.92 s, sys: 19 ms, total: 2.94 s
Wall time: 2.94 s
In [ ]: